Library Requirements¶

In [1]:
!pip install pydeck
Requirement already satisfied: pydeck in /usr/local/lib/python3.11/dist-packages (0.9.1)
Requirement already satisfied: jinja2>=2.10.1 in /usr/local/lib/python3.11/dist-packages (from pydeck) (3.1.5)
Requirement already satisfied: numpy>=1.16.4 in /usr/local/lib/python3.11/dist-packages (from pydeck) (1.26.4)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2>=2.10.1->pydeck) (3.0.2)
In [2]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import pydeck as pdk
from shapely.geometry import Point, MultiPoint
import statsmodels.formula.api as smf

Pre-Processing¶

  1. Read the CSV file and explore the datasets.
  2. Remove NA values.
  3. Convert the column from object to boolean type of variable or better to read.
In [3]:
# Read the CSV files
df = pd.read_csv("real-estate-data.csv")
df.head()
Out[3]:
id_ ward beds baths DEN size parking exposure D_mkt building_age maint price lt lg
0 219129 W13 3.0 3 No 1500-1999 sqft N No 16.0 9 1087.0 1821000.0 43.617997 -79.392383
1 757581 W13 1.0 1 YES 500-999 sqft Yes We 23.0 3 469.0 613000.0 43.648968 -79.390031
2 404196 W13 2.0 2 YES 500-999 sqft Yes We 12.0 1 767.0 838000.0 43.641045 -79.375436
3 821441 W13 2.0 2 No 500-999 sqft Yes S 10.0 25 827.0 935000.0 43.642122 -79.370250
4 612090 W10 2.0 1 No NaN N S 5.0 1 NaN 1328000.0 43.692210 -79.365015
In [4]:
# Read the Toronto neighbourhood GeoJSON files
# Load the GeoJSON file into a GeoDataFrame
neighborhoods = gpd.read_file("Neighbourhoods - 4326.geojson")
neighborhoods.head()
Out[4]:
_id AREA_ID AREA_ATTR_ID PARENT_AREA_ID AREA_SHORT_CODE AREA_LONG_CODE AREA_NAME AREA_DESC CLASSIFICATION CLASSIFICATION_CODE OBJECTID geometry
0 1 2502366 26022881 0 174 174 South Eglinton-Davisville South Eglinton-Davisville (174) Not an NIA or Emerging Neighbourhood NA 17824737.0 MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ...
1 2 2502365 26022880 0 173 173 North Toronto North Toronto (173) Not an NIA or Emerging Neighbourhood NA 17824753.0 MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ...
2 3 2502364 26022879 0 172 172 Dovercourt Village Dovercourt Village (172) Not an NIA or Emerging Neighbourhood NA 17824769.0 MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ...
3 4 2502363 26022878 0 171 171 Junction-Wallace Emerson Junction-Wallace Emerson (171) Not an NIA or Emerging Neighbourhood NA 17824785.0 MULTIPOLYGON (((-79.4387 43.66766, -79.43841 4...
4 5 2502362 26022877 0 170 170 Yonge-Bay Corridor Yonge-Bay Corridor (170) Not an NIA or Emerging Neighbourhood NA 17824801.0 MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ...
In [5]:
# Shape of the dataset and data types
print(df.shape)
print(df.dtypes)
(3042, 14)
id_               int64
ward             object
beds            float64
baths             int64
DEN              object
size             object
parking          object
exposure         object
D_mkt           float64
building_age      int64
maint           float64
price           float64
lt              float64
lg              float64
dtype: object
In [6]:
# Count missing values for each column
missing_values = df.isna().sum()
print(missing_values)
id_              0
ward             0
beds            54
baths            0
DEN              0
size            53
parking          0
exposure         0
D_mkt           93
building_age     0
maint           45
price           61
lt               0
lg               0
dtype: int64
In [7]:
# Calculate which neighbourhood a unit belongs to.
# Create a geometry column from the longitude and latitude values:
df['geometry'] = df.apply(lambda row: Point(row['lg'], row['lt']), axis=1)

# Convert the DataFrame to a GeoDataFrame with WGS84 CRS
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Ensure the neighbourhoods GeoDataFrame is in the same CRS as our points
if neighborhoods.crs != gdf.crs:
    neighborhoods = neighborhoods.to_crs(gdf.crs)

# Perform a spatial join: assign each point the attributes of the neighbourhood it falls in.
# Here we assume the neighbourhood name is stored in a column called 'neighbourhood' in the GeoJSON.
# Adjust the column name accordingly if different.
gdf = gpd.sjoin(gdf, neighborhoods[['AREA_NAME', 'geometry']], how="left", predicate="within")

# Add the 'neighbourhood' column back to the original DataFrame
df['AREA_NAME'] = gdf['AREA_NAME']
df.drop(columns='geometry', inplace=True)
In [8]:
# Drop NA values
df = df.dropna()
df.shape
Out[8]:
(2749, 15)
In [9]:
# Check the unique values from each column to get the sense of the dataset

for col in df.columns:
    print(f"Unique values in '{col}':")
    print(df[col].unique())
    print("-" * 40)
Unique values in 'id_':
[219129 757581 404196 ... 476334 413482 484367]
----------------------------------------
Unique values in 'ward':
['W13' 'W10' 'W11']
----------------------------------------
Unique values in 'beds':
[3. 1. 2. 0.]
----------------------------------------
Unique values in 'baths':
[3 1 2]
----------------------------------------
Unique values in 'DEN':
['No' 'YES']
----------------------------------------
Unique values in 'size':
['1500-1999 sqft' '500-999 sqft' '2000-2499 sqft' '1000-1499 sqft'
 '0-499 sqft' '5500-3999 sqft' '2500-2999 sqft' '3000-3499 sqft'
 '4000+ sqft']
----------------------------------------
Unique values in 'parking':
['N' 'Yes']
----------------------------------------
Unique values in 'exposure':
['No' 'We' 'S' 'E']
----------------------------------------
Unique values in 'D_mkt':
[ 16.  23.  12.  10.   1.   4.   5.  13.  19.  37.   3.  54.   2.   6.
  24.  27.   0.  28.  33.  30.  45.  26.  43.  17.  15.   8.  29.  35.
  20.   7.  34.  21.  14.  11.   9.  22.  31.  40.  46.  32.  74.  36.
  18.  25.  38.  55.  87.  47.  62.  44.  53.  78.  42.  49.  39.  67.
  59.  83.  57.  48.  41.  51. 169.  64.  73.  58.  79.  52.  65.  85.
  50.  80.  63.  56.  81.  86.  68.  89.  66.  61.  84.  70.  69.]
----------------------------------------
Unique values in 'building_age':
[ 9  3  1 25 12  8  4 11  0 19  7 16 14  5  6 13 10  2 15 34 50 23 56 20
 29 31 22 21 17 39 24 37 52 18 46 47 33 28 42 32 30 26 48 27 49 58 55 62
 36 59 60 61 40 35 45 41 44 43 75 66 69 65 64 53 38]
----------------------------------------
Unique values in 'maint':
[1087.  469.  767. ... 1109.  892. 1183.]
----------------------------------------
Unique values in 'price':
[1821000.  613000.  838000. ... 1202000. 1409000. 1386000.]
----------------------------------------
Unique values in 'lt':
[43.61799707 43.64896846 43.64104467 ... 43.63682962 43.66931697
 43.64853276]
----------------------------------------
Unique values in 'lg':
[-79.39238293 -79.39003091 -79.37543576 ... -79.41218711 -79.40804663
 -79.35875259]
----------------------------------------
Unique values in 'AREA_NAME':
['St Lawrence-East Bayfront-The Islands' 'Wellington Place'
 'Kensington-Chinatown' 'Cabbagetown-South St.James Town'
 'Harbourfront-CityPlace' 'Rosedale-Moore Park' 'South Parkdale'
 'Regent Park' 'Palmerston-Little Italy' 'Annex' 'North St.James Town'
 'Fort York-Liberty Village' 'Leaside-Bennington' 'Moss Park'
 'Bay-Cloverhill' 'Dovercourt Village' 'Trinity-Bellwoods'
 'Downtown Yonge East' 'Yonge-Bay Corridor' 'Church-Wellesley'
 'University' 'West Queen West']
----------------------------------------
In [10]:
# Convert boolean-like categorical variables to binary format
df["DEN"] = df["DEN"].map({"YES": 1, "No": 0}).astype(int)
df["parking"] = df["parking"].map({"Yes": 1, "N": 0}).astype(int)
df["exposure"] = df["exposure"].map({"No": "North", "We": "West", "S": "South", "E": "East"})
<ipython-input-10-2a5a80bf2f33>:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["DEN"] = df["DEN"].map({"YES": 1, "No": 0}).astype(int)
<ipython-input-10-2a5a80bf2f33>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["parking"] = df["parking"].map({"Yes": 1, "N": 0}).astype(int)
<ipython-input-10-2a5a80bf2f33>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["exposure"] = df["exposure"].map({"No": "North", "We": "West", "S": "South", "E": "East"})
In [11]:
# After doing the pre-processing, unique values of our data types

for col in df.columns:
    print(f"Unique values in '{col}':")
    print(df[col].unique())
    print("-" * 40)
Unique values in 'id_':
[219129 757581 404196 ... 476334 413482 484367]
----------------------------------------
Unique values in 'ward':
['W13' 'W10' 'W11']
----------------------------------------
Unique values in 'beds':
[3. 1. 2. 0.]
----------------------------------------
Unique values in 'baths':
[3 1 2]
----------------------------------------
Unique values in 'DEN':
[0 1]
----------------------------------------
Unique values in 'size':
['1500-1999 sqft' '500-999 sqft' '2000-2499 sqft' '1000-1499 sqft'
 '0-499 sqft' '5500-3999 sqft' '2500-2999 sqft' '3000-3499 sqft'
 '4000+ sqft']
----------------------------------------
Unique values in 'parking':
[0 1]
----------------------------------------
Unique values in 'exposure':
['North' 'West' 'South' 'East']
----------------------------------------
Unique values in 'D_mkt':
[ 16.  23.  12.  10.   1.   4.   5.  13.  19.  37.   3.  54.   2.   6.
  24.  27.   0.  28.  33.  30.  45.  26.  43.  17.  15.   8.  29.  35.
  20.   7.  34.  21.  14.  11.   9.  22.  31.  40.  46.  32.  74.  36.
  18.  25.  38.  55.  87.  47.  62.  44.  53.  78.  42.  49.  39.  67.
  59.  83.  57.  48.  41.  51. 169.  64.  73.  58.  79.  52.  65.  85.
  50.  80.  63.  56.  81.  86.  68.  89.  66.  61.  84.  70.  69.]
----------------------------------------
Unique values in 'building_age':
[ 9  3  1 25 12  8  4 11  0 19  7 16 14  5  6 13 10  2 15 34 50 23 56 20
 29 31 22 21 17 39 24 37 52 18 46 47 33 28 42 32 30 26 48 27 49 58 55 62
 36 59 60 61 40 35 45 41 44 43 75 66 69 65 64 53 38]
----------------------------------------
Unique values in 'maint':
[1087.  469.  767. ... 1109.  892. 1183.]
----------------------------------------
Unique values in 'price':
[1821000.  613000.  838000. ... 1202000. 1409000. 1386000.]
----------------------------------------
Unique values in 'lt':
[43.61799707 43.64896846 43.64104467 ... 43.63682962 43.66931697
 43.64853276]
----------------------------------------
Unique values in 'lg':
[-79.39238293 -79.39003091 -79.37543576 ... -79.41218711 -79.40804663
 -79.35875259]
----------------------------------------
Unique values in 'AREA_NAME':
['St Lawrence-East Bayfront-The Islands' 'Wellington Place'
 'Kensington-Chinatown' 'Cabbagetown-South St.James Town'
 'Harbourfront-CityPlace' 'Rosedale-Moore Park' 'South Parkdale'
 'Regent Park' 'Palmerston-Little Italy' 'Annex' 'North St.James Town'
 'Fort York-Liberty Village' 'Leaside-Bennington' 'Moss Park'
 'Bay-Cloverhill' 'Dovercourt Village' 'Trinity-Bellwoods'
 'Downtown Yonge East' 'Yonge-Bay Corridor' 'Church-Wellesley'
 'University' 'West Queen West']
----------------------------------------

Exploratory Data Analysis¶

Distribution of House Prices¶

In [12]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Create an enhanced histogram with KDE (Kernel Density Estimation)
plt.figure(figsize=(12, 6))
ax = sns.histplot(df["price"], bins=30, kde=True, color="dodgerblue", edgecolor="black", alpha=0.7)

# Format x-axis for price display
ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Price ($)", fontsize=13, fontweight="bold")
plt.ylabel("Frequency", fontsize=13, fontweight="bold")
plt.title("Distribution of Property Prices", fontsize=15, fontweight="bold", pad=15)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
No description has been provided for this image

Distribution of Property Prices by Bedrooms and Den Presence¶

In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Ensure 'DEN' column exists and fill NaN values if necessary
if "DEN" in df.columns:
    df["DEN"] = df["DEN"].fillna(0)  # Replace missing values with 0
    df["DEN"] = df["DEN"].map({1: "With Den", 0: "No Den"})  # Convert to readable format
else:
    raise KeyError("Column 'DEN' not found in DataFrame")

# Set an enhanced modern style
sns.set_theme(style="whitegrid")

# Create the violin plot
plt.figure(figsize=(12, 6))
ax = sns.violinplot(
    x="beds", y="price", hue="DEN", data=df, palette="coolwarm", split=True, inner="quartile", linewidth=1.5, saturation=0.9
)

# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Number of Bedrooms", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Distribution of Property Prices by Bedrooms and Den Presence", fontsize=15, fontweight="bold", pad=15)

# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Add legend with better positioning
plt.legend(title="Den Presence", title_fontsize="12", loc="upper left", bbox_to_anchor=(1, 1))

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
No description has been provided for this image

Distribution of Exposure vs Price¶

In [14]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Create an enhanced boxplot
plt.figure(figsize=(10, 6))
ax = sns.boxplot(
    x="exposure", y="price", data=df, palette="coolwarm", linewidth=1.5, notch=True, width=0.6
)

# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Exposure", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Property Prices by Exposure Direction", fontsize=15, fontweight="bold", pad=15)

# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
<ipython-input-14-9da728f6007c>:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(
No description has been provided for this image

Distribution of Day in the Market vs Price¶

In [15]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Create an enhanced scatter plot
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x=df["D_mkt"], y=df["price"], alpha=0.6, color="purple", edgecolor="black"
)

# Add a trend line using regression (without confidence interval for clarity)
sns.regplot(
    x=df["D_mkt"], y=df["price"], scatter=False, color="black", line_kws={"linestyle": "dashed"}
)

# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Days on Market", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Days on Market vs Property Price", fontsize=15, fontweight="bold", pad=15)

# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
No description has been provided for this image

Distribution of Age of Building vs Property Price¶

In [16]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Create an enhanced scatter plot for Building Age vs Price
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    x=df["building_age"], y=df["price"], alpha=0.6, color="red", edgecolor="black"
)

# Add a trend line using regression (without confidence interval for clarity)
sns.regplot(
    x=df["building_age"], y=df["price"], scatter=False, color="black", line_kws={"linestyle": "dashed"}
)

# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Building Age (Years)", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Building Age vs Property Price", fontsize=15, fontweight="bold", pad=15)

# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
No description has been provided for this image
In [17]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Create FacetGrid for Building Age vs Price divided by number of bedrooms
g = sns.FacetGrid(df, col="beds", col_wrap=3, height=4, sharex=True, sharey=True)
g.map_dataframe(sns.scatterplot, x="building_age", y="price", alpha=0.6, color="red", edgecolor="black")

# Add regression lines to each facet
g.map_dataframe(sns.regplot, x="building_age", y="price", scatter=False, color="black", line_kws={"linestyle": "dashed"})

# Format y-axis for price display
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize titles and labels
g.set_axis_labels("Building Age (Years)", "Price ($)")
g.set_titles(col_template="Beds: {col_name}")

# Improve readability
plt.subplots_adjust(top=0.85)
g.fig.suptitle("Building Age vs Property Price Across Different Bedroom Counts", fontsize=15, fontweight="bold")

# Show the plot
plt.show()
No description has been provided for this image

Distribution of Parking vs Price¶

In [18]:
# Set a modern style
sns.set_theme(style="whitegrid")

# Convert 'parking' column to a readable format
df["parking"] = df["parking"].map({1: "With Parking", 0: "No Parking"})

# Create an enhanced boxplot
plt.figure(figsize=(8, 6))
ax = sns.boxplot(
    x="parking", y="price", data=df, palette="Set2", linewidth=1.5, notch=True, width=0.6
)

# Format y-axis for price display
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Parking Availability", fontsize=13, fontweight="bold")
plt.ylabel("Price ($)", fontsize=13, fontweight="bold")
plt.title("Impact of Parking Availability on Property Prices", fontsize=15, fontweight="bold", pad=15)

# Improve x-axis readability
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
<ipython-input-18-e15dd4ec88be>:9: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(
No description has been provided for this image

Distribution of Property Prices by Size¶

In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

size_order = [
    "0-499 sqft", "500-999 sqft", "1000-1499 sqft", "1500-1999 sqft",
    "2000-2499 sqft", "2500-2999 sqft", "3000-3499 sqft", "4000+ sqft"
]

# Define color mapping for each size category
size_colors = {
    "0-499 sqft": "red",
    "500-999 sqft": "blue",
    "1000-1499 sqft": "green",
    "1500-1999 sqft": "purple",
    "2000-2499 sqft": "orange",
    "2500-2999 sqft": "pink",
    "3000-3499 sqft": "cyan",
    "4000+ sqft": "brown"
}

# Set a modern style
sns.set_theme(style="whitegrid")

# Create the violin plot with an enhanced color palette
plt.figure(figsize=(12, 6))
ax = sns.violinplot(
    x="size", y="price", data=df, order=size_order, palette="coolwarm", inner="quartile", linewidth=1.2
)

# Format y-axis to display full price values (e.g., $500K, $1M, etc.)
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize the appearance
plt.xlabel("Size Category", fontsize=12, fontweight="bold")
plt.ylabel("Price ($)", fontsize=12, fontweight="bold")
plt.title("Distribution of Property Prices by Size", fontsize=14, fontweight="bold", pad=15)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)

# Remove top and right borders for a cleaner look
sns.despine()

# Show the plot
plt.show()
<ipython-input-19-ac7d038b7078>:27: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.violinplot(
No description has been provided for this image

High Correlation between Beds and Baths¶

In [20]:
import numpy as np

correlation = df["beds"].corr(df["baths"])
print(f"Correlation between beds and baths: {correlation:.2f}")
Correlation between beds and baths: 0.75

Scatterplot of Maintence Cost vs Property Price¶

In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# Set a modern style
sns.set_theme(style="whitegrid")

# Create the scatter plot
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x="maint", y="price", data=df, alpha=0.6, color="dodgerblue")

# Format y-axis to display full price values
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'${x:,.0f}'))

# Customize appearance
plt.xlabel("Maintenance Cost ($)", fontsize=12, fontweight="bold")
plt.ylabel("Price ($)", fontsize=12, fontweight="bold")
plt.title("Scatter Plot of Maintenance Cost vs Property Price", fontsize=14, fontweight="bold", pad=15)

# Improve grid visibility
plt.grid(True, linestyle="--", alpha=0.6)

# Show the plot
plt.show()
No description has been provided for this image
In [22]:
import folium
from folium.plugins import MarkerCluster

# Initialize a map centered around Toronto
m = folium.Map(location=[df["lt"].mean(), df["lg"].mean()], zoom_start=12)

# Define a price-based color scale
def price_color(price):
    if price < 500000:
        return "green"
    elif 500000 <= price < 1000000:
        return "blue"
    elif 1000000 <= price < 1500000:
        return "orange"
    else:
        return "red"

# Add markers for each property
marker_cluster = MarkerCluster().add_to(m)
for _, row in df.iterrows():
    folium.Marker(
        location=[row["lt"], row["lg"]],
        popup=f"Price: ${row['price']:,.0f}\nBeds: {row['beds']}\nBaths: {row['baths']}",
        icon=folium.Icon(color=price_color(row["price"]))
    ).add_to(marker_cluster)

# Display map
m.save("property_map.html")  # Saves map as an HTML file (open in browser)
m
Out[22]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [23]:
# # 1. Load your CSV data (ensure "real-estate-data.csv" is in your working directory)
# df = pd.read_csv("real-estate-data.csv")
# df = df.dropna(subset=["lt", "lg", "price", "ward"])

# 2. Define an elevation scale for the bars
elevation_scale = 0.0002

# 3. Define a function to map price to a color for the 3D bars
def price_to_color(price):
    min_price, max_price = 500000, 2500000
    ratio = (price - min_price) / (max_price - min_price + 1e-9)
    ratio = max(0, min(ratio, 1))
    r = int(150 + ratio * (255 - 150))
    g = int(ratio * 255)
    b = int(150 - ratio * 150)
    return [r, g, b]

df["bar_color"] = df["price"].apply(price_to_color)

# 4. Create a ColumnLayer for property data (bars colored by price)
column_layer = pdk.Layer(
    "ColumnLayer",
    data=df,
    get_position=["lg", "lt"],    # [longitude, latitude]
    get_elevation="price",         # Height based on price
    elevation_scale=elevation_scale,
    radius=25,                     # Skinnier columns
    get_fill_color="bar_color",    # Color by price
    pickable=True,
    extruded=True,
    auto_highlight=True,
)

# 5. Compute maximum elevation so we can position ward labels above the bars
max_price_val = df["price"].max()
max_elevation = max_price_val * elevation_scale
label_elevation = max_elevation + 50  # Place label 50 units above the tallest bar

# 6. Group by ward and compute a convex hull (or fallback rectangle) for each ward
ward_regions = []
# Define a color mapping for the wards:
ward_color_map = {
    "W10": [255, 99, 71],   # Tomato (e.g., Spadina-Fort York)
    "W11": [60, 179, 113],  # Medium Sea Green (e.g., University-Rosedale)
    "W13": [65, 105, 225]   # Royal Blue (e.g., Toronto Centre)
}

unique_wards = df["ward"].unique()
for ward in unique_wards:
    group = df[df["ward"] == ward]
    points = list(zip(group["lg"], group["lt"]))
    if len(points) < 3:
        # Fallback: create a rectangle covering the points
        min_lg = group["lg"].min()
        max_lg = group["lg"].max()
        min_lt = group["lt"].min()
        max_lt = group["lt"].max()
        polygon = [
            [min_lg, min_lt],
            [min_lg, max_lt],
            [max_lg, max_lt],
            [max_lg, min_lt],
            [min_lg, min_lt]  # Close the polygon
        ]
        centroid_lg = (min_lg + max_lg) / 2
        centroid_lt = (min_lt + max_lt) / 2
    else:
        convex_hull = MultiPoint(points).convex_hull
        polygon = list(convex_hull.exterior.coords)
        centroid_lg = convex_hull.centroid.x
        centroid_lt = convex_hull.centroid.y
    fill_color = ward_color_map.get(ward, [200, 200, 200])
    ward_regions.append({
        "ward": ward,
        "polygon": polygon,
        "fill_color": fill_color,
        "center_lg": centroid_lg,
        "center_lt": centroid_lt
    })

df_wards = pd.DataFrame(ward_regions)

# 7. Create a PolygonLayer for the ward regions (colored only where data exists)
polygon_layer = pdk.Layer(
    "PolygonLayer",
    data=df_wards,
    get_polygon="polygon",
    get_fill_color="fill_color",
    get_line_color=[255, 255, 255],
    pickable=False,     # Disable tooltips on polygons
    stroked=True,
    extruded=False,     # Flat polygons
    opacity=0.3,        # Semi-transparent for a polished look
)

# 8. Create a TextLayer for ward labels, positioned at each ward's centroid at a fixed elevation
text_layer = pdk.Layer(
    "TextLayer",
    data=df_wards,
    get_position=["center_lg", "center_lt", label_elevation],
    get_text="ward",
    get_color=[255, 255, 255],
    get_size=32,                   # Large font for presentation
    get_alignment_baseline="'bottom'",
    pickable=False,
)

# 9. Set the initial view centered on your data points
center_lat = df["lt"].mean()
center_lg = df["lg"].mean()
view_state = pdk.ViewState(
    latitude=center_lat,
    longitude=center_lg,
    zoom=12,
    pitch=50,  # Tilt for a good 3D perspective
)

# 10. Create the Deck with three layers:
#      - PolygonLayer: colored ward regions (covering only the area with data)
#      - ColumnLayer: 3D property bars (colored by price)
#      - TextLayer: ward labels above the bars
deck = pdk.Deck(
    layers=[polygon_layer, column_layer, text_layer],
    initial_view_state=view_state,
    map_style="https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json",
    tooltip={"text": "Price: {price}\nBeds: {beds}\nBaths: {baths}\nWard: {ward}"}
)

# 11. Export the map to an HTML file
deck.to_html("geospatial.html")

# 12. Inject a custom HTML legend into the exported HTML file
legend_html = """
<div id="legend" style="position: absolute; top: 20px; right: 20px; background: rgba(255,255,255,0.8); padding: 10px; border-radius: 5px; font-family: sans-serif; z-index: 9999;">
  <div><span style="display:inline-block; width:12px; height:12px; background: rgb(255,99,71); margin-right:5px;"></span>W10 Spadina-Fort York</div>
  <div><span style="display:inline-block; width:12px; height:12px; background: rgb(60,179,113); margin-right:5px;"></span>W11 University-Rosedale</div>
  <div><span style="display:inline-block; width:12px; height:12px; background: rgb(65,105,225); margin-right:5px;"></span>W13 Toronto Centre</div>
</div>
"""

html_file = "geospatial.html"
with open(html_file, "r", encoding="utf-8") as f:
    html_content = f.read()

# Insert the legend right after the <body> tag
modified_html = html_content.replace("<body>", "<body>" + legend_html, 1)

with open("geospatial_with_legend.html", "w", encoding="utf-8") as f:
    f.write(modified_html)

print("Map with legend saved as 'geospatial_with_legend.html'")
pydeck
Map with legend saved as 'geospatial_with_legend.html'
In [24]:
import folium
import branca.colormap as cm

# Compute the average condo price for each neighbourhood from the DataFrame (df)
avg_price = df.groupby("AREA_NAME")["price"].mean().reset_index()

# Merge the average prices into the neighbourhood GeoDataFrame
# (Assuming that the common column is named "neighbourhood")
neigh = neighborhoods.merge(avg_price, on="AREA_NAME", how="left")

# Determine the min and max average price for the color scale
min_price = neigh["price"].min()
max_price = neigh["price"].max()

# Create a linear colormap ranging from red (low price) to blue (high price)
colormap = cm.LinearColormap(colors=["red", "blue"], vmin=min_price, vmax=max_price)
colormap.caption = "Average Condo Price (C$)"

# Initialize a Folium map centered on Toronto (latitude 43.65, longitude -79.38)
m = folium.Map(location=[43.65, -79.38], zoom_start=12)

# Add the neighbourhood polygons as a GeoJson layer
folium.GeoJson(
    neigh,
    style_function=lambda feature: {
        "fillColor": colormap(feature["properties"]["price"])
                     if feature["properties"]["price"] is not None else "gray",
        "color": "black",
        "weight": 1,
        "fillOpacity": 0.7,
    },
    tooltip=folium.features.GeoJsonTooltip(
        fields=["AREA_NAME", "price"],
        aliases=["Neighbourhood:", "Avg Price:"],
        localize=True,
    ),
).add_to(m)

# Add the colormap to the map as a legend
colormap.add_to(m)

# Display the map (in a Jupyter Notebook this will render the interactive map)
m
Out[24]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Basic Model¶

Simple Linear Model¶

In [25]:
model = smf.ols('price ~ ward + beds + baths + DEN + size + parking + exposure + D_mkt + building_age + maint + lt + lg', data=df).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.920
Method:                 Least Squares   F-statistic:                     1431.
Date:                Sun, 02 Mar 2025   Prob (F-statistic):               0.00
Time:                        15:44:10   Log-Likelihood:                -36798.
No. Observations:                2749   AIC:                         7.364e+04
Df Residuals:                    2726   BIC:                         7.378e+04
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
===========================================================================================
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                3.199e+07   3.03e+07      1.055      0.291   -2.74e+07    9.14e+07
ward[T.W11]             -6528.5448   1.19e+04     -0.547      0.585   -2.99e+04    1.69e+04
ward[T.W13]             -2.337e+04    1.3e+04     -1.795      0.073   -4.89e+04    2160.646
DEN[T.With Den]          1.836e+04   6494.343      2.828      0.005    5628.526    3.11e+04
size[T.1000-1499 sqft]   3.172e+05   1.29e+04     24.646      0.000    2.92e+05    3.42e+05
size[T.1500-1999 sqft]   6.028e+05   1.84e+04     32.776      0.000    5.67e+05    6.39e+05
size[T.2000-2499 sqft]   8.711e+05   2.56e+04     33.990      0.000    8.21e+05    9.21e+05
size[T.2500-2999 sqft]   9.987e+05   4.03e+04     24.756      0.000     9.2e+05    1.08e+06
size[T.3000-3499 sqft]    1.19e+06   4.82e+04     24.667      0.000     1.1e+06    1.28e+06
size[T.4000+ sqft]       1.759e+06    6.4e+04     27.494      0.000    1.63e+06    1.88e+06
size[T.500-999 sqft]     9.659e+04   8204.931     11.772      0.000    8.05e+04    1.13e+05
size[T.5500-3999 sqft]   1.632e+06   4.87e+04     33.539      0.000    1.54e+06    1.73e+06
parking[T.With Parking]  4577.7335   6136.838      0.746      0.456   -7455.590    1.66e+04
exposure[T.North]       -2909.4908   9509.430     -0.306      0.760   -2.16e+04    1.57e+04
exposure[T.South]       -9786.8494   9249.280     -1.058      0.290   -2.79e+04    8349.459
exposure[T.West]        -2.368e+04   1.09e+04     -2.174      0.030    -4.5e+04   -2325.346
beds                     7.309e+04   7040.570     10.382      0.000    5.93e+04    8.69e+04
baths                   -4895.5616   7338.499     -0.667      0.505   -1.93e+04    9494.021
D_mkt                      72.4343    220.141      0.329      0.742    -359.226     504.095
building_age              -83.8346    308.142     -0.272      0.786    -688.050     520.380
maint                     550.1355     11.387     48.314      0.000     527.808     572.463
lt                       -3.53e+05   3.99e+05     -0.886      0.376   -1.13e+06    4.28e+05
lg                       2.062e+05   2.15e+05      0.957      0.339   -2.16e+05    6.28e+05
==============================================================================
Omnibus:                      382.375   Durbin-Watson:                   2.016
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3646.035
Skew:                           0.323   Prob(JB):                         0.00
Kurtosis:                       8.605   Cond. No.                     9.11e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.11e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [25]: